In [3]:
%run notebook.config.ipy


The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython
set database db: ../csvdb
set database anndb: /gfs/mirror/annotations/mm10_ensembl83/csvdb
set database ipydb: csvdb
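
The output above comes from notebook.config.ipy, which loads the rpy2 IPython extension and records the locations of the three SQLite databases used below (db for the project results, anndb for the mm10 Ensembl 83 annotations, ipydb for tables written from this notebook). The file itself is not shown here; a minimal sketch of the kind of setup it presumably performs (paths taken from the printed output, with DB assumed to be a helper module, imported by the config file, that provides fetch_DataFrame and write_DataFrame) would be:

%load_ext rpy2.ipython

# database locations as reported above (assumed to be defined by the config file)
db = "../csvdb"
anndb = "/gfs/mirror/annotations/mm10_ensembl83/csvdb"
ipydb = "csvdb"

for name, path in (("db", db), ("anndb", anndb), ("ipydb", ipydb)):
    print("set database %s: %s" % (name, path))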

Unmelt the count data into a new table


In [4]:
statement = '''select * from featurecounts'''

# fetch the melted counts and pivot into a genes x cells (track) matrix
counts = DB.fetch_DataFrame(statement, db)
count_table = counts.pivot(columns="track", index="gene_id", values="counts")

print count_table.shape

# persist the unmelted table in the notebook database
DB.write_DataFrame(count_table, "count_table", ipydb)
#count_table.to_csv("count_table.txt",sep="\t")


(22014, 147)
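
Here pivot() reshapes the long ("melted") featurecounts table, which has one row per (gene_id, track) pair, into a wide genes-by-cells matrix: one row per gene_id, one column per track. A minimal illustration with made-up values:

import pandas as pd

# toy melted table: one row per (gene, cell) pair
melted = pd.DataFrame({
    "gene_id": ["g1", "g1", "g2", "g2"],
    "track":   ["cellA", "cellB", "cellA", "cellB"],
    "counts":  [10, 0, 5, 7],
})

# unmelt: genes as rows, cells as columns
wide = melted.pivot(columns="track", index="gene_id", values="counts")
print(wide)
# track    cellA  cellB
# gene_id
# g1          10      0
# g2           5      7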

Select the cells based on the QC metrics


In [10]:
# e.g. select cells:
# (i) expressing more than 3000 genes and
# (ii) where <50% of reads map to spike-in sequences and
# (iii) which have fewer than 7 million reads.

statement = '''select sample_id 
               from qc_summary q
               where q.cufflinks_no_genes_pc > 3000
                 and q.fraction_spike < 0.5
                 and q.total_reads < 7000000'''

good_samples = DB.fetch_DataFrame(statement, db)["sample_id"].values

print len(good_samples)


42
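
The same selection can also be made in pandas after fetching the whole QC table, which makes it easy to experiment with different thresholds; a sketch, assuming qc_summary contains exactly the columns referenced in the query above:

# fetch the full QC summary and filter it client-side
qc = DB.fetch_DataFrame("select * from qc_summary", db)

keep = ((qc["cufflinks_no_genes_pc"] > 3000) &
        (qc["fraction_spike"] < 0.5) &
        (qc["total_reads"] < 7000000))

good_samples = qc.loc[keep, "sample_id"].values
print(len(good_samples))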

Prepare a count table containing only the selected cells


In [11]:
# build a double-quoted, comma-separated list of the selected sample (column) names
sample_stat = '"' + '", "'.join(good_samples) + '"'

# fetch a pandas dataframe containing the "good" cells (samples)
statement = '''select gene_id, %(sample_stat)s
               from count_table''' % locals()

count_table_filtered = DB.fetch_DataFrame(statement, ipydb)
print count_table_filtered.shape

# write a new frame containing the filtered data
DB.write_DataFrame(count_table_filtered, "count_table_filtered", ipydb)


(22014, 43)
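
For downstream tools that expect a flat file rather than a database table, the filtered matrix can be read back and exported, in the spirit of the commented-out to_csv call earlier; a sketch using sqlite3 and pandas directly, assuming ipydb is the local "csvdb" file reported by the config cell and using the hypothetical output name count_table_filtered.txt:

import sqlite3
import pandas as pd

# connect to the notebook's scratch database and read the filtered table back
con = sqlite3.connect("csvdb")
filtered = pd.read_sql_query("select * from count_table_filtered", con)
con.close()

# use gene_id as the row index: 22014 genes x 42 retained cells
filtered = filtered.set_index("gene_id")
print(filtered.shape)

# write a tab-separated file for use outside the database
filtered.to_csv("count_table_filtered.txt", sep="\t")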